from gensim.models import Word2Vec
import _pickle as pickle
import tweepy
def load_obj(name):
    """Unpickle and return the object stored at output/<name>.pkl."""
    path = 'output/' + name + '.pkl'
    with open(path, 'rb') as handle:
        return pickle.load(handle)
# Load the pre-trained artifacts produced by the training pipeline
# (word2vec model, word->cluster map, and the fitted random forest).
print("Loading Word Vector Model...")
model = Word2Vec.load("output/400features_30minwords_10context_twitter")
print("Loading Second Word Vector Model...")
# NOTE(review): model2 loads the exact same file as model and is never
# used below — looks like a copy-paste leftover; confirm and remove.
model2 = Word2Vec.load("output/400features_30minwords_10context_twitter")
print("Loading Word Centroid Map...")
# Maps each vocabulary word to its k-means cluster index (see load_obj).
word_centroid_map = load_obj("twitter_word_centroid_map")
print("Loading Trained Random Forest Classifier...")
# Classifier trained on bag-of-centroids features; used by predict() below.
load_forest = load_obj("twitter_forest")
print("Setting up Twitter Authentication...")
# SECURITY(review): live API credentials are hardcoded in source. Anyone
# with read access to this file (or the VCS history) can act as this
# Twitter account. Revoke/rotate these keys and load them from the
# environment or a config file excluded from version control.
consumer_key = "GlYCSvDgUet79gori1M5rxmMW"
consumer_secret = "JRNb6FIjsSMOu6CU4QRMdJ1kMsVd7IF6g9PnKgD2qrdeva2iFY"
access_token = "259205396-ZWx5lQCRzy5GMnzmNTIQzMckDqRnzjfVnoFu0VgG"
access_token_secret = "eBx4oQQYHhXRgQE6cOioMSUhzpLWNLc8c2hgL4GGmG2Kd"
# OAuth 1.0a user-context authentication for the REST API.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
import re
def process_tweet(tweet, punctuation=False):
    """Normalize a raw tweet into a list of lowercase word tokens.

    Strips @mentions, URLs, the standalone retweet marker "RT", and
    '#' characters; optionally strips basic punctuation as well.

    Args:
        tweet: raw tweet text.
        punctuation: when True, also remove '.', ',', '?' and '!'.

    Returns:
        List of lowercase tokens (whitespace-split).
    """
    # Drop @mentions.
    tweet = re.sub(r'@[^\s]+', '', tweet)
    # Drop URLs. Bug fix: the original pattern used 'www\.[\s]+', which
    # matched whitespace after "www." — '[^\s]+' removes the actual link.
    tweet = re.sub(r'((www\.[^\s]+)|(https?:/?/?[^\s]+))', '', tweet)
    # Drop the retweet marker. Bug fix: plain str.replace('RT', '')
    # mangled any word containing "RT" (e.g. "START" -> "STA "); word
    # boundaries restrict removal to the standalone token.
    tweet = re.sub(r'\bRT\b', '', tweet)
    tweet = tweet.replace('#', '')
    if punctuation:
        for ch in '.,?!':
            tweet = tweet.replace(ch, '')
    return tweet.lower().split()
# All word vectors as one (vocab_size, 400) matrix.
# NOTE(review): wv.syn0 is the legacy gensim (<4.0) attribute; newer
# releases expose this as wv.vectors — confirm the pinned gensim version.
word_vectors = model.wv.syn0
# One cluster per ~10 vocabulary words — presumably mirrors the k used
# by the training-time k-means that built word_centroid_map; verify.
num_clusters = int(word_vectors.shape[0] / 10)
def create_bag_of_centroids(wordlist, word_centroid_map):
    """Build a cluster-occurrence histogram for a tokenized tweet.

    Args:
        wordlist: iterable of tokens.
        word_centroid_map: dict mapping each known word to its cluster index.

    Returns:
        float32 numpy vector of length max(cluster index) + 1, where
        entry i counts how many tokens fall in cluster i; unknown
        tokens are ignored.
    """
    num_centroids = max(word_centroid_map.values()) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    known = (word_centroid_map[w] for w in wordlist if w in word_centroid_map)
    for cluster_index in known:
        bag_of_centroids[cluster_index] += 1
    return bag_of_centroids
# Sanity check: print the model's nearest neighbours of "awful" on one
# line (each word followed by two spaces, matching print's default sep).
neighbours = model.most_similar("awful")
print("".join(word + "  " for word, _score in neighbours))
# Show the member words of a few sample clusters.
# Perf fix: the original re-materialised list(word_centroid_map.values())
# and list(...keys()) on every index inside the loop — O(n^2) over the
# vocabulary. A single pass over .items() gives the same words in the
# same order.
for cluster in range(77, 80):
    print("\nCluster %d" % cluster)
    words = [word for word, assigned in word_centroid_map.items()
             if assigned == cluster]
    print(words)
# Search term and sample size for the live-tweet sentiment run.
query = "Star Wars"
max_tweets = 500
print ("Loading Tweets...")
# Page through the search API until max_tweets statuses are collected.
# NOTE(review): api.search is the tweepy v3 name; tweepy v4 renamed it
# to api.search_tweets — confirm the installed tweepy version.
searched_tweets = [status.text for status in tweepy.Cursor(api.search, q=query, lang="en").items(max_tweets)]
import numpy as np
print ("Pre-allocating an Array...")
# One row of cluster counts per fetched tweet.
user_centroids = np.zeros( (max_tweets, num_clusters), \
dtype="float32" )
print ("Producing Test Centroids...")
# Idiom fix: enumerate replaces the original manual counter variable.
# NOTE(review): create_bag_of_centroids sizes its vector from
# word_centroid_map (max index + 1), which is assumed to equal
# num_clusters; a mismatch would raise on row assignment — confirm.
for row, tweet in enumerate(searched_tweets):
    user_centroids[row] = create_bag_of_centroids(process_tweet(tweet, True), word_centroid_map)
print ("Predicting Test Sets...")
# Classify every tweet's bag-of-centroids vector in one batch call.
result = load_forest.predict(user_centroids)
# Tally predictions per class label.
unique, counts = np.unique(result, return_counts=True)
result_dict = dict(zip(unique, counts))
print ("\nPrediction :")
# Labels 4 = positive, 0 = negative — presumably the Sentiment140
# convention used at training time; confirm against the training script.
# .get(label, 0) guards against a class being absent from the results.
print (" Positive - %.2f%%,\n Negative - %.2f%%" %
(result_dict.get(4, 0)*100/len(result),\
result_dict.get(0, 0)*100/len(result)))
def switch(x):
    """Map a numeric sentiment label to its display name.

    0 maps to 'Negative'; every other value maps to 'Positive'.
    """
    return 'Negative' if x == 0 else 'Positive'
# Pair each tweet with its human-readable sentiment label and show a
# sample of the first ten.
formatted_result = [switch(label) for label in result]
output = list(zip(formatted_result, searched_tweets))
for sentiment, text in output[:10]:
    print(sentiment + "\n" + text + "\n")